# The usual imports
import altair as alt
import geopandas as gpd
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Show all columns in dataframes
pd.options.display.max_columns = 999

# Hide warnings due to issue in shapely package
# See: https://github.com/shapely/shapely/issues/1345
np.seterr(invalid="ignore");

import osmnx as ox

# Load the NYC borough boundaries and keep only Manhattan
NYC = gpd.read_file("new-york-city-boroughs.geojson")
NYC_M = NYC[NYC["name"] == "Manhattan"]
NYC_M
name
cartodb_id
created_at
updated_at
geometry
3
Manhattan
4
2013-03-09 02:42:03.692000+00:00
2013-03-09 02:42:03.989000+00:00
MULTIPOLYGON (((-74.01093 40.68449, -74.01193 ...
2 Get the street network graph
# Plot the Manhattan boundary.
# NOTE: the original comment said "Web Mercator", but EPSG:4326 is WGS84
# lat/lon (Web Mercator is EPSG:3857); the to_crs call here keeps the
# data in geographic coordinates.
ax = NYC_M.to_crs(epsg=4326).plot(facecolor="none", edgecolor="black")
ax.set_axis_off()
3 Convert your network graph edges to a GeoDataFrame
# Dissolve the Manhattan GeoDataFrame into a single boundary polygon
polygon = NYC_M.unary_union

# Download the drivable street network inside that polygon
G = ox.graph_from_polygon(polygon, network_type='drive')

# Convert the graph edges to a GeoDataFrame (nodes are not needed)
edges_gdf = ox.graph_to_gdfs(G, nodes=False, edges=True)

# Peek at the first few edges
print(edges_gdf.head())

# Plot the street network
fig, ax = plt.subplots(figsize=(20, 20))
edges_gdf.plot(ax=ax, linewidth=1, edgecolor='black')
plt.show()
osmid name \
u v key
42421728 42435337 0 195743153 Central Park West
42421731 0 [420625565, 420625573, 5668966] West 106th Street
42432736 0 [1271523197, 1271523198] Central Park West
42421731 42437916 0 5671485 Manhattan Avenue
42432737 0 195743186 Manhattan Avenue
highway maxspeed oneway reversed length \
u v key
42421728 42435337 0 secondary 25 mph False True 85.345
42421731 0 secondary NaN False False 138.033
42432736 0 secondary 25 mph False False 86.275
42421731 42437916 0 residential NaN False True 86.149
42432737 0 residential NaN False False 85.968
geometry \
u v key
42421728 42435337 0 LINESTRING (-73.96004 40.79805, -73.96011 40.7...
42421731 0 LINESTRING (-73.96004 40.79805, -73.96017 40.7...
42432736 0 LINESTRING (-73.96004 40.79805, -73.95997 40.7...
42421731 42437916 0 LINESTRING (-73.96147 40.79865, -73.96154 40.7...
42432737 0 LINESTRING (-73.96147 40.79865, -73.96140 40.7...
lanes ref access bridge tunnel width junction
u v key
42421728 42435337 0 NaN NaN NaN NaN NaN NaN NaN
42421731 0 NaN NaN NaN NaN NaN NaN NaN
42432736 0 NaN NaN NaN NaN NaN NaN NaN
42421731 42437916 0 NaN NaN NaN NaN NaN NaN NaN
42432737 0 NaN NaN NaN NaN NaN NaN NaN
4 Load Crash Data
# Read the crash records into a pandas DataFrame
data = pd.read_csv("Motor_Vehicle_Collisions_Crashes.csv")
data
CRASH DATE
CRASH TIME
BOROUGH
ZIP CODE
LATITUDE
LONGITUDE
LOCATION
ON STREET NAME
CROSS STREET NAME
OFF STREET NAME
NUMBER OF PERSONS INJURED
NUMBER OF PERSONS KILLED
NUMBER OF PEDESTRIANS INJURED
NUMBER OF PEDESTRIANS KILLED
NUMBER OF CYCLIST INJURED
NUMBER OF CYCLIST KILLED
NUMBER OF MOTORIST INJURED
NUMBER OF MOTORIST KILLED
CONTRIBUTING FACTOR VEHICLE 1
CONTRIBUTING FACTOR VEHICLE 2
CONTRIBUTING FACTOR VEHICLE 3
CONTRIBUTING FACTOR VEHICLE 4
CONTRIBUTING FACTOR VEHICLE 5
COLLISION_ID
VEHICLE TYPE CODE 1
VEHICLE TYPE CODE 2
VEHICLE TYPE CODE 3
VEHICLE TYPE CODE 4
VEHICLE TYPE CODE 5
0
05/01/2021
13:30
MANHATTAN
10029.0
40.796300
-73.938290
(40.7963, -73.93829)
EAST 115 STREET
2 AVENUE
NaN
0
0
0
0
0
0
0
0
Passing or Lane Usage Improper
Unspecified
NaN
NaN
NaN
4412937
Bus
Sedan
NaN
NaN
NaN
1
05/01/2021
17:50
MANHATTAN
10012.0
40.720936
-73.993805
(40.720936, -73.993805)
BOWERY
SPRING STREET
NaN
1
0
0
0
0
0
1
0
Driver Inattention/Distraction
Unspecified
NaN
NaN
NaN
4412445
Sedan
Sedan
NaN
NaN
NaN
2
05/01/2021
13:30
MANHATTAN
10128.0
40.780693
-73.946600
(40.780693, -73.9466)
EAST 92 STREET
1 AVENUE
NaN
0
0
0
0
0
0
0
0
Driver Inattention/Distraction
Unspecified
NaN
NaN
NaN
4414390
AMBULANCE
Sedan
NaN
NaN
NaN
3
05/01/2021
9:40
MANHATTAN
10026.0
40.800537
-73.948360
(40.800537, -73.94836)
NaN
NaN
40 WEST 115 STREET
0
0
0
0
0
0
0
0
Backing Unsafely
Unspecified
NaN
NaN
NaN
4417017
Station Wagon/Sport Utility Vehicle
NaN
NaN
NaN
NaN
4
05/01/2021
23:03
MANHATTAN
10009.0
40.726864
-73.979910
(40.726864, -73.97991)
AVENUE B
EAST 10 STREET
NaN
1
0
0
0
1
0
0
0
Driver Inattention/Distraction
Driver Inattention/Distraction
NaN
NaN
NaN
4412243
Bike
NaN
NaN
NaN
NaN
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
32364
12/31/2023
23:18
MANHATTAN
10030.0
40.819670
-73.944240
(40.81967, -73.94424)
8 AVENUE
WEST 140 STREET
NaN
0
0
0
0
0
0
0
0
Driver Inattention/Distraction
NaN
NaN
NaN
NaN
4692572
Sedan
NaN
NaN
NaN
NaN
32365
12/31/2023
18:03
MANHATTAN
10039.0
40.824130
-73.940980
(40.82413, -73.94098)
8 AVENUE
WEST 147 STREET
NaN
1
0
1
0
0
0
0
0
Unspecified
NaN
NaN
NaN
NaN
4692571
NaN
NaN
NaN
NaN
NaN
32366
12/31/2023
14:00
MANHATTAN
10028.0
40.777890
-73.955890
(40.77789, -73.95589)
NaN
NaN
160 EAST 84 STREET
0
0
0
0
0
0
0
0
Driver Inattention/Distraction
Unspecified
NaN
NaN
NaN
4692524
Sedan
Sedan
NaN
NaN
NaN
32367
12/31/2023
21:34
MANHATTAN
10033.0
40.849308
-73.931920
(40.849308, -73.93192)
WEST 182 STREET
AUDUBON AVENUE
NaN
0
0
0
0
0
0
0
0
Unspecified
Unspecified
NaN
NaN
NaN
4692192
Station Wagon/Sport Utility Vehicle
Sedan
NaN
NaN
NaN
32368
12/31/2023
0:38
MANHATTAN
10006.0
40.709496
-74.013900
(40.709496, -74.0139)
ALBANY STREET
WASHINGTON STREET
NaN
0
0
0
0
0
0
0
0
Other Vehicular
Unspecified
NaN
NaN
NaN
4692585
Sedan
Pick-up Truck
NaN
NaN
NaN
32369 rows × 29 columns
5 Convert the crash data to a GeoDataFrame
from shapely.geometry import Point

# Check that the coordinate columns exist before building geometries
if 'LATITUDE' in data.columns and 'LONGITUDE' in data.columns:
    # Build Point geometries from the LONGITUDE/LATITUDE columns (x=lon, y=lat)
    geometry = [Point(xy) for xy in zip(data['LONGITUDE'], data['LATITUDE'])]

    # Create a GeoDataFrame from the crash records
    gdf = gpd.GeoDataFrame(data, geometry=geometry)

    # Set the coordinate reference system (CRS) to WGS 84 (EPSG:4326)
    gdf.set_crs(epsg=4326, inplace=True)

    # Display the first few rows of the GeoDataFrame
    print(gdf.head())
else:
    # Fixed: the message (and the comment above) previously referenced
    # 'DEC_LAT'/'DEC_LONG', which are not the columns this block checks.
    print("The DataFrame does not contain 'LATITUDE' and 'LONGITUDE' columns.")
CRASH DATE CRASH TIME BOROUGH ZIP CODE LATITUDE LONGITUDE \
0 05/01/2021 13:30 MANHATTAN 10029.0 40.796300 -73.938290
1 05/01/2021 17:50 MANHATTAN 10012.0 40.720936 -73.993805
2 05/01/2021 13:30 MANHATTAN 10128.0 40.780693 -73.946600
3 05/01/2021 9:40 MANHATTAN 10026.0 40.800537 -73.948360
4 05/01/2021 23:03 MANHATTAN 10009.0 40.726864 -73.979910
LOCATION ON STREET NAME CROSS STREET NAME \
0 (40.7963, -73.93829) EAST 115 STREET 2 AVENUE
1 (40.720936, -73.993805) BOWERY SPRING STREET
2 (40.780693, -73.9466) EAST 92 STREET 1 AVENUE
3 (40.800537, -73.94836) NaN NaN
4 (40.726864, -73.97991) AVENUE B EAST 10 STREET
OFF STREET NAME NUMBER OF PERSONS INJURED \
0 NaN 0
1 NaN 1
2 NaN 0
3 40 WEST 115 STREET 0
4 NaN 1
NUMBER OF PERSONS KILLED NUMBER OF PEDESTRIANS INJURED \
0 0 0
1 0 0
2 0 0
3 0 0
4 0 0
NUMBER OF PEDESTRIANS KILLED NUMBER OF CYCLIST INJURED \
0 0 0
1 0 0
2 0 0
3 0 0
4 0 1
NUMBER OF CYCLIST KILLED NUMBER OF MOTORIST INJURED \
0 0 0
1 0 1
2 0 0
3 0 0
4 0 0
NUMBER OF MOTORIST KILLED CONTRIBUTING FACTOR VEHICLE 1 \
0 0 Passing or Lane Usage Improper
1 0 Driver Inattention/Distraction
2 0 Driver Inattention/Distraction
3 0 Backing Unsafely
4 0 Driver Inattention/Distraction
CONTRIBUTING FACTOR VEHICLE 2 CONTRIBUTING FACTOR VEHICLE 3 \
0 Unspecified NaN
1 Unspecified NaN
2 Unspecified NaN
3 Unspecified NaN
4 Driver Inattention/Distraction NaN
CONTRIBUTING FACTOR VEHICLE 4 CONTRIBUTING FACTOR VEHICLE 5 COLLISION_ID \
0 NaN NaN 4412937
1 NaN NaN 4412445
2 NaN NaN 4414390
3 NaN NaN 4417017
4 NaN NaN 4412243
VEHICLE TYPE CODE 1 VEHICLE TYPE CODE 2 \
0 Bus Sedan
1 Sedan Sedan
2 AMBULANCE Sedan
3 Station Wagon/Sport Utility Vehicle NaN
4 Bike NaN
VEHICLE TYPE CODE 3 VEHICLE TYPE CODE 4 VEHICLE TYPE CODE 5 \
0 NaN NaN NaN
1 NaN NaN NaN
2 NaN NaN NaN
3 NaN NaN NaN
4 NaN NaN NaN
geometry
0 POINT (-73.93829 40.79630)
1 POINT (-73.99380 40.72094)
2 POINT (-73.94660 40.78069)
3 POINT (-73.94836 40.80054)
4 POINT (-73.97991 40.72686)
6 Trim the crash data to Manhattan
# Build a Manhattan boundary as the convex hull of all street edges.
# (The original comments said "Center City" — a leftover from a
# Philadelphia template; this data is Manhattan.)
manhattan_boundary = edges_gdf.geometry.unary_union.convex_hull

# Keep only the crashes that fall inside that boundary
manhattan_crashes = gdf[gdf.geometry.within(manhattan_boundary)]

# Report how many crashes fall within Manhattan
print(f"Number of crashes within manhattan: {len(manhattan_crashes)}")

# Display the first few rows of the filtered GeoDataFrame
manhattan_crashes.head()
Number of crashes within manhattan: 31042
CRASH DATE
CRASH TIME
BOROUGH
ZIP CODE
LATITUDE
LONGITUDE
LOCATION
ON STREET NAME
CROSS STREET NAME
OFF STREET NAME
NUMBER OF PERSONS INJURED
NUMBER OF PERSONS KILLED
NUMBER OF PEDESTRIANS INJURED
NUMBER OF PEDESTRIANS KILLED
NUMBER OF CYCLIST INJURED
NUMBER OF CYCLIST KILLED
NUMBER OF MOTORIST INJURED
NUMBER OF MOTORIST KILLED
CONTRIBUTING FACTOR VEHICLE 1
CONTRIBUTING FACTOR VEHICLE 2
CONTRIBUTING FACTOR VEHICLE 3
CONTRIBUTING FACTOR VEHICLE 4
CONTRIBUTING FACTOR VEHICLE 5
COLLISION_ID
VEHICLE TYPE CODE 1
VEHICLE TYPE CODE 2
VEHICLE TYPE CODE 3
VEHICLE TYPE CODE 4
VEHICLE TYPE CODE 5
geometry
0
05/01/2021
13:30
MANHATTAN
10029.0
40.796300
-73.938290
(40.7963, -73.93829)
EAST 115 STREET
2 AVENUE
NaN
0
0
0
0
0
0
0
0
Passing or Lane Usage Improper
Unspecified
NaN
NaN
NaN
4412937
Bus
Sedan
NaN
NaN
NaN
POINT (-73.93829 40.79630)
1
05/01/2021
17:50
MANHATTAN
10012.0
40.720936
-73.993805
(40.720936, -73.993805)
BOWERY
SPRING STREET
NaN
1
0
0
0
0
0
1
0
Driver Inattention/Distraction
Unspecified
NaN
NaN
NaN
4412445
Sedan
Sedan
NaN
NaN
NaN
POINT (-73.99380 40.72094)
2
05/01/2021
13:30
MANHATTAN
10128.0
40.780693
-73.946600
(40.780693, -73.9466)
EAST 92 STREET
1 AVENUE
NaN
0
0
0
0
0
0
0
0
Driver Inattention/Distraction
Unspecified
NaN
NaN
NaN
4414390
AMBULANCE
Sedan
NaN
NaN
NaN
POINT (-73.94660 40.78069)
3
05/01/2021
9:40
MANHATTAN
10026.0
40.800537
-73.948360
(40.800537, -73.94836)
NaN
NaN
40 WEST 115 STREET
0
0
0
0
0
0
0
0
Backing Unsafely
Unspecified
NaN
NaN
NaN
4417017
Station Wagon/Sport Utility Vehicle
NaN
NaN
NaN
NaN
POINT (-73.94836 40.80054)
4
05/01/2021
23:03
MANHATTAN
10009.0
40.726864
-73.979910
(40.726864, -73.97991)
AVENUE B
EAST 10 STREET
NaN
1
0
0
0
1
0
0
0
Driver Inattention/Distraction
Driver Inattention/Distraction
NaN
NaN
NaN
4412243
Bike
NaN
NaN
NaN
NaN
POINT (-73.97991 40.72686)
7 Re-project our data into an appropriate CRS (EPSG:2263)
import osmnx as ox

# Project the street graph to the New York–Long Island state plane CRS
# (EPSG:2263). The original comments said "Philadelphia state plane
# (EPSG:2272)", which is not the CRS this code actually uses.
G_projected = ox.project_graph(G, to_crs='EPSG:2263')

# Project the crash GeoDataFrame to the same CRS
manhattan_crashes_projected = manhattan_crashes.to_crs(epsg=2263)

# Display the first few rows of the projected GeoDataFrame
manhattan_crashes_projected.head()

# Create a plot
fig, ax = plt.subplots(figsize=(12, 12))

# Plot the street network
edges_gdf_projected = ox.graph_to_gdfs(G_projected, nodes=False)
edges_gdf_projected.plot(ax=ax, linewidth=1, edgecolor='gray', label='Street Network')

# Plot the crash locations
manhattan_crashes_projected.plot(ax=ax, marker='o', color='red', markersize=5, label='Crashes')

# Add a title and legend
plt.title('Crash Locations in Manhattan with Street Network')
plt.legend()

# Show the plot
plt.show()
1.8 Find the nearest edge for each crash
See: ox.distance.nearest_edges(). It takes three arguments:
the network graph the longitude of your crash data (the x attribute of the geometry column) the latitude of your crash data (the y attribute of the geometry column) You will get a numpy array with 3 columns that represent (u, v, key) where each u and v are the node IDs that the edge links together. We will ignore the key value for our analysis.
# Pull the projected x/y coordinates out of the crash geometries
crash_x = manhattan_crashes_projected.geometry.x
crash_y = manhattan_crashes_projected.geometry.y

# Snap every crash to its nearest street edge
nearest_edges = ox.distance.nearest_edges(G_projected, crash_x, crash_y)

# Work with the result as a numpy array of (u, v, key) rows
nearest_edges_array = np.array(nearest_edges)
print(nearest_edges_array[:5])

# Keep only the (u, v) node pairs; the key column is not used in this analysis
nearest_edges_uv = nearest_edges_array[:, :2]
print(nearest_edges_uv[:5])
1.9 Calculate the total number of crashes per street
# Tabulate the nearest-edge assignment for every crash
edges_df = pd.DataFrame(nearest_edges_array, columns=['u', 'v', 'key'])

# Count crashes per (u, v) street edge
crash_counts = edges_df.groupby(['u', 'v']).size().reset_index(name='crash_count')

# Show the resulting per-edge counts
crash_counts
u
v
crash_count
0
42421728
42432736
2
1
42421731
42437916
1
2
42421737
42437917
2
3
42421741
42432756
1
4
42421751
42421749
1
...
...
...
...
5796
12162436970
42455357
2
5797
12181309686
4597668039
5
5798
12299314857
12299314860
1
5799
12299314860
42438476
3
5800
12374690312
42433537
1
5801 rows × 3 columns
1.10 Merge your edges GeoDataFrame and crash count DataFrame
# Convert the projected graph to a GeoDataFrame of edges
edges_gdf_projected = ox.graph_to_gdfs(G_projected, nodes=False)

# Merge the edges with the per-edge crash counts
merged_df = edges_gdf_projected.merge(crash_counts, on=['u', 'v'], how='left')

# Edges that matched no crash get a count of zero
merged_df['crash_count'] = merged_df['crash_count'].fillna(0)
merged_df

# Keep only edges that saw at least one crash.
# .copy() makes this an independent frame so the column assignments in the
# next cell do not raise SettingWithCopyWarning (as they did before).
filtered_df = merged_df[merged_df['crash_count'] > 0.0].copy()
filtered_df
u
v
osmid
name
highway
maxspeed
oneway
reversed
length
geometry
lanes
ref
access
bridge
tunnel
width
junction
crash_count
2
42421728
42432736
[1271523197, 1271523198]
Central Park West
secondary
25 mph
False
False
86.275
LINESTRING (995312.767 230030.016, 995334.152 ...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
2.0
3
42435337
42437916
5670640
West 105th Street
residential
25 mph
True
False
137.996
LINESTRING (995176.877 229785.340, 995144.253 ...
1
NaN
NaN
NaN
NaN
NaN
NaN
1.0
6
42421731
42437916
5671485
Manhattan Avenue
residential
NaN
False
True
86.149
LINESTRING (994916.519 230250.770, 994899.394 ...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
1.0
11
42432736
42435341
1271523197
Central Park West
secondary
25 mph
False
False
80.116
LINESTRING (995450.120 230277.316, 995461.822 ...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
2.0
13
42437916
42437917
5670640
West 105th Street
residential
25 mph
True
False
135.012
LINESTRING (994779.437 230003.728, 994751.078 ...
1
NaN
NaN
NaN
NaN
NaN
NaN
8.0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
9864
7802856372
7802856349
661227257
Central Park West
secondary
25 mph
False
True
80.457
LINESTRING (990516.812 221373.627, 990505.794 ...
4
NaN
NaN
NaN
NaN
NaN
NaN
1.0
9865
7802856372
7802856356
[1271523171, 1271523172]
Central Park West
secondary
25 mph
False
False
79.496
LINESTRING (990516.812 221373.627, 990527.802 ...
4
NaN
NaN
NaN
NaN
NaN
NaN
4.0
9867
8288270047
246580982
5671698
West 16th Street
residential
25 mph
True
False
21.068
LINESTRING (981879.246 210378.461, 981886.366 ...
1
NaN
NaN
NaN
NaN
NaN
NaN
6.0
9869
8840333851
42453952
5672377
Church Street
secondary
25 mph
True
False
83.590
LINESTRING (981444.123 198698.940, 981458.126 ...
3
NaN
NaN
NaN
NaN
NaN
NaN
2.0
9878
11942111842
42434962
[658488325, 658499796, 658499797, 420872214, 6...
NaN
motorway_link
NaN
True
False
290.747
LINESTRING (991424.925 211158.515, 991364.955 ...
[2, 1, 3]
NaN
NaN
NaN
NaN
NaN
NaN
1.0
5805 rows × 18 columns
1.11 Calculate a “Crash Index”
# Work on an explicit copy: filtered_df was built as a slice, and assigning
# new columns to a slice raised SettingWithCopyWarning (see the warnings
# this cell used to emit).
filtered_df = filtered_df.copy()

# Step 1: crash index = log10 of crashes per unit of street length
filtered_df['crash_index'] = np.log10(filtered_df['crash_count'] / filtered_df['length'])

# Step 2: min-max normalize the crash index to a 0-1 scale
min_crash_index = filtered_df['crash_index'].min()
max_crash_index = filtered_df['crash_index'].max()
filtered_df['crash_index_normalized'] = (filtered_df['crash_index'] - min_crash_index) / (max_crash_index - min_crash_index)

filtered_df
C:\Users\txx11\mambaforge\envs\musa-550-fall-2023\lib\site-packages\geopandas\geodataframe.py:1538: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
super().__setitem__(key, value)
C:\Users\txx11\mambaforge\envs\musa-550-fall-2023\lib\site-packages\geopandas\geodataframe.py:1538: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
super().__setitem__(key, value)
u
v
osmid
name
highway
maxspeed
oneway
reversed
length
geometry
lanes
ref
access
bridge
tunnel
width
junction
crash_count
crash_index
crash_index_normalized
2
42421728
42432736
[1271523197, 1271523198]
Central Park West
secondary
25 mph
False
False
86.275
LINESTRING (995312.767 230030.016, 995334.152 ...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
2.0
-1.634855
0.384469
3
42435337
42437916
5670640
West 105th Street
residential
25 mph
True
False
137.996
LINESTRING (995176.877 229785.340, 995144.253 ...
1
NaN
NaN
NaN
NaN
NaN
NaN
1.0
-2.139866
0.244838
6
42421731
42437916
5671485
Manhattan Avenue
residential
NaN
False
True
86.149
LINESTRING (994916.519 230250.770, 994899.394 ...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
1.0
-1.935250
0.301413
11
42432736
42435341
1271523197
Central Park West
secondary
25 mph
False
False
80.116
LINESTRING (995450.120 230277.316, 995461.822 ...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
2.0
-1.602689
0.393363
13
42437916
42437917
5670640
West 105th Street
residential
25 mph
True
False
135.012
LINESTRING (994779.437 230003.728, 994751.078 ...
1
NaN
NaN
NaN
NaN
NaN
NaN
8.0
-1.227282
0.497160
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
9864
7802856372
7802856349
661227257
Central Park West
secondary
25 mph
False
True
80.457
LINESTRING (990516.812 221373.627, 990505.794 ...
4
NaN
NaN
NaN
NaN
NaN
NaN
1.0
-1.905564
0.309621
9865
7802856372
7802856356
[1271523171, 1271523172]
Central Park West
secondary
25 mph
False
False
79.496
LINESTRING (990516.812 221373.627, 990527.802 ...
4
NaN
NaN
NaN
NaN
NaN
NaN
4.0
-1.298285
0.477528
9867
8288270047
246580982
5671698
West 16th Street
residential
25 mph
True
False
21.068
LINESTRING (981879.246 210378.461, 981886.366 ...
1
NaN
NaN
NaN
NaN
NaN
NaN
6.0
-0.545472
0.685674
9869
8840333851
42453952
5672377
Church Street
secondary
25 mph
True
False
83.590
LINESTRING (981444.123 198698.940, 981458.126 ...
3
NaN
NaN
NaN
NaN
NaN
NaN
2.0
-1.621124
0.388266
9878
11942111842
42434962
[658488325, 658499796, 658499797, 420872214, 6...
NaN
motorway_link
NaN
True
False
290.747
LINESTRING (991424.925 211158.515, 991364.955 ...
[2, 1, 3]
NaN
NaN
NaN
NaN
NaN
NaN
1.0
-2.463515
0.155352
5805 rows × 20 columns
1.12 Plot a histogram of the crash index values
import matplotlib.pyplot as plt

# Histogram of the normalized crash index values
plt.figure(figsize=(10, 6))
plt.hist(filtered_df['crash_index_normalized'], bins=30, color='skyblue', edgecolor='black')
plt.title('Histogram of Normalized Crash Index')
plt.xlabel('Normalized Crash Index')
plt.ylabel('Frequency')
plt.grid(axis='y', alpha=0.75)

# Render the figure
plt.show()
1.13 Plot an interactive map of the street networks, colored by the crash index
import folium
import geopandas as gpd
import matplotlib.pyplot as plt

# Base map centered on Manhattan with a dark theme
m = folium.Map(location=[40.7826, -73.9656], zoom_start=12, tiles='CartoDB dark_matter')

def style_function(feature):
    """Style a street segment by its normalized crash index."""
    crash_index = feature['properties']['crash_index_normalized']

    # plt.cm.get_cmap() is deprecated since Matplotlib 3.7 (it warned on
    # every call); use the colormap registry instead.
    colormap = plt.colormaps['viridis']

    # RGBA color for this crash index (already normalized to [0, 1])
    color = colormap(crash_index)

    # Convert RGBA to a hex string
    color_hex = '#{:02x}{:02x}{:02x}'.format(int(color[0]*255), int(color[1]*255), int(color[2]*255))

    return {
        'color': color_hex,
        'weight': 3 + crash_index * 2,  # thicker lines for higher crash index
        'opacity': 0.8
    }

# Add the styled street segments to the map
folium.GeoJson(
    filtered_df,
    style_function=style_function,
    tooltip=folium.GeoJsonTooltip(fields=['name', 'crash_index_normalized']),
).add_to(m)

# Display the map
m
C:\Users\txx11\AppData\Local\Temp\ipykernel_2124\3546299534.py:14: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
colormap = plt.cm.get_cmap('viridis')
Make this Notebook Trusted to load map: File -> Trust Notebook
# Save the interactive crash-index map to an HTML file.
# NOTE(review): the filename spells "mahattan" (sic) — kept as-is so any
# existing links to the output file keep working; fix the spelling only
# together with its consumers.
m.save('mahattan_crash_index_map_dark.html')
C:\Users\txx11\AppData\Local\Temp\ipykernel_2124\3546299534.py:14: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
colormap = plt.cm.get_cmap('viridis')
Part 2: Density Map of Crashes by time of the day
# Load the larger crash dataset
dat2 = pd.read_csv("Crash data_large.csv")
# BUG FIX: this cell previously read from `data`/`gdf` (the small dataset)
# everywhere except the first column check, so the large dataset loaded
# into `dat2` was never actually used. It now operates on `dat2` throughout.
if 'LATITUDE' in dat2.columns and 'LONGITUDE' in dat2.columns:
    # Build Point geometries from the large dataset's coordinates (x=lon, y=lat)
    geometry = [Point(xy) for xy in zip(dat2['LONGITUDE'], dat2['LATITUDE'])]

    # Create a GeoDataFrame for the large dataset
    gdf2 = gpd.GeoDataFrame(dat2, geometry=geometry)

    # Set the coordinate reference system (CRS) to WGS 84 (EPSG:4326)
    gdf2.set_crs(epsg=4326, inplace=True)

    # Keep only the crashes that fall inside the Manhattan boundary
    dat2 = gdf2[gdf2.geometry.within(manhattan_boundary)]

    # Report the count of the data actually kept (was len(manhattan_crashes))
    print(f"Number of crashes within manhattan: {len(dat2)}")

# Display the first few rows of the filtered GeoDataFrame
dat2.head()
Number of crashes within manhattan: 31042
CRASH DATE
CRASH TIME
BOROUGH
ZIP CODE
LATITUDE
LONGITUDE
LOCATION
ON STREET NAME
CROSS STREET NAME
OFF STREET NAME
NUMBER OF PERSONS INJURED
NUMBER OF PERSONS KILLED
NUMBER OF PEDESTRIANS INJURED
NUMBER OF PEDESTRIANS KILLED
NUMBER OF CYCLIST INJURED
NUMBER OF CYCLIST KILLED
NUMBER OF MOTORIST INJURED
NUMBER OF MOTORIST KILLED
CONTRIBUTING FACTOR VEHICLE 1
CONTRIBUTING FACTOR VEHICLE 2
CONTRIBUTING FACTOR VEHICLE 3
CONTRIBUTING FACTOR VEHICLE 4
CONTRIBUTING FACTOR VEHICLE 5
COLLISION_ID
VEHICLE TYPE CODE 1
VEHICLE TYPE CODE 2
VEHICLE TYPE CODE 3
VEHICLE TYPE CODE 4
VEHICLE TYPE CODE 5
geometry
0
05/01/2021
13:30
MANHATTAN
10029.0
40.796300
-73.938290
(40.7963, -73.93829)
EAST 115 STREET
2 AVENUE
NaN
0
0
0
0
0
0
0
0
Passing or Lane Usage Improper
Unspecified
NaN
NaN
NaN
4412937
Bus
Sedan
NaN
NaN
NaN
POINT (-73.93829 40.79630)
1
05/01/2021
17:50
MANHATTAN
10012.0
40.720936
-73.993805
(40.720936, -73.993805)
BOWERY
SPRING STREET
NaN
1
0
0
0
0
0
1
0
Driver Inattention/Distraction
Unspecified
NaN
NaN
NaN
4412445
Sedan
Sedan
NaN
NaN
NaN
POINT (-73.99380 40.72094)
2
05/01/2021
13:30
MANHATTAN
10128.0
40.780693
-73.946600
(40.780693, -73.9466)
EAST 92 STREET
1 AVENUE
NaN
0
0
0
0
0
0
0
0
Driver Inattention/Distraction
Unspecified
NaN
NaN
NaN
4414390
AMBULANCE
Sedan
NaN
NaN
NaN
POINT (-73.94660 40.78069)
3
05/01/2021
9:40
MANHATTAN
10026.0
40.800537
-73.948360
(40.800537, -73.94836)
NaN
NaN
40 WEST 115 STREET
0
0
0
0
0
0
0
0
Backing Unsafely
Unspecified
NaN
NaN
NaN
4417017
Station Wagon/Sport Utility Vehicle
NaN
NaN
NaN
NaN
POINT (-73.94836 40.80054)
4
05/01/2021
23:03
MANHATTAN
10009.0
40.726864
-73.979910
(40.726864, -73.97991)
AVENUE B
EAST 10 STREET
NaN
1
0
0
0
1
0
0
0
Driver Inattention/Distraction
Driver Inattention/Distraction
NaN
NaN
NaN
4412243
Bike
NaN
NaN
NaN
NaN
POINT (-73.97991 40.72686)
# Work on an explicit copy so the column assignments below do not raise
# SettingWithCopyWarning (dat2 was produced by slicing a GeoDataFrame).
dat2 = dat2.copy()

# Parse 'CRASH TIME' strings (e.g. "13:30", "9:40") into datetimes.
# pandas fills in a dummy date of 1900-01-01; only the time part matters.
dat2['CRASH TIME'] = pd.to_datetime(dat2['CRASH TIME'], format='%H:%M')

# Extract the hour of day into a new 'CRASH HOUR' column
dat2['CRASH HOUR'] = dat2['CRASH TIME'].dt.hour

dat2
C:\Users\txx11\mambaforge\envs\musa-550-fall-2023\lib\site-packages\geopandas\geodataframe.py:1538: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
super().__setitem__(key, value)
C:\Users\txx11\mambaforge\envs\musa-550-fall-2023\lib\site-packages\geopandas\geodataframe.py:1538: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
super().__setitem__(key, value)
CRASH DATE
CRASH TIME
BOROUGH
ZIP CODE
LATITUDE
LONGITUDE
LOCATION
ON STREET NAME
CROSS STREET NAME
OFF STREET NAME
NUMBER OF PERSONS INJURED
NUMBER OF PERSONS KILLED
NUMBER OF PEDESTRIANS INJURED
NUMBER OF PEDESTRIANS KILLED
NUMBER OF CYCLIST INJURED
NUMBER OF CYCLIST KILLED
NUMBER OF MOTORIST INJURED
NUMBER OF MOTORIST KILLED
CONTRIBUTING FACTOR VEHICLE 1
CONTRIBUTING FACTOR VEHICLE 2
CONTRIBUTING FACTOR VEHICLE 3
CONTRIBUTING FACTOR VEHICLE 4
CONTRIBUTING FACTOR VEHICLE 5
COLLISION_ID
VEHICLE TYPE CODE 1
VEHICLE TYPE CODE 2
VEHICLE TYPE CODE 3
VEHICLE TYPE CODE 4
VEHICLE TYPE CODE 5
geometry
CRASH HOUR
0
05/01/2021
1900-01-01 13:30:00
MANHATTAN
10029.0
40.796300
-73.938290
(40.7963, -73.93829)
EAST 115 STREET
2 AVENUE
NaN
0
0
0
0
0
0
0
0
Passing or Lane Usage Improper
Unspecified
NaN
NaN
NaN
4412937
Bus
Sedan
NaN
NaN
NaN
POINT (-73.93829 40.79630)
13
1
05/01/2021
1900-01-01 17:50:00
MANHATTAN
10012.0
40.720936
-73.993805
(40.720936, -73.993805)
BOWERY
SPRING STREET
NaN
1
0
0
0
0
0
1
0
Driver Inattention/Distraction
Unspecified
NaN
NaN
NaN
4412445
Sedan
Sedan
NaN
NaN
NaN
POINT (-73.99380 40.72094)
17
2
05/01/2021
1900-01-01 13:30:00
MANHATTAN
10128.0
40.780693
-73.946600
(40.780693, -73.9466)
EAST 92 STREET
1 AVENUE
NaN
0
0
0
0
0
0
0
0
Driver Inattention/Distraction
Unspecified
NaN
NaN
NaN
4414390
AMBULANCE
Sedan
NaN
NaN
NaN
POINT (-73.94660 40.78069)
13
3
05/01/2021
1900-01-01 09:40:00
MANHATTAN
10026.0
40.800537
-73.948360
(40.800537, -73.94836)
NaN
NaN
40 WEST 115 STREET
0
0
0
0
0
0
0
0
Backing Unsafely
Unspecified
NaN
NaN
NaN
4417017
Station Wagon/Sport Utility Vehicle
NaN
NaN
NaN
NaN
POINT (-73.94836 40.80054)
9
4
05/01/2021
1900-01-01 23:03:00
MANHATTAN
10009.0
40.726864
-73.979910
(40.726864, -73.97991)
AVENUE B
EAST 10 STREET
NaN
1
0
0
0
1
0
0
0
Driver Inattention/Distraction
Driver Inattention/Distraction
NaN
NaN
NaN
4412243
Bike
NaN
NaN
NaN
NaN
POINT (-73.97991 40.72686)
23
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
32364
12/31/2023
1900-01-01 23:18:00
MANHATTAN
10030.0
40.819670
-73.944240
(40.81967, -73.94424)
8 AVENUE
WEST 140 STREET
NaN
0
0
0
0
0
0
0
0
Driver Inattention/Distraction
NaN
NaN
NaN
NaN
4692572
Sedan
NaN
NaN
NaN
NaN
POINT (-73.94424 40.81967)
23
32365
12/31/2023
1900-01-01 18:03:00
MANHATTAN
10039.0
40.824130
-73.940980
(40.82413, -73.94098)
8 AVENUE
WEST 147 STREET
NaN
1
0
1
0
0
0
0
0
Unspecified
NaN
NaN
NaN
NaN
4692571
NaN
NaN
NaN
NaN
NaN
POINT (-73.94098 40.82413)
18
32366
12/31/2023
1900-01-01 14:00:00
MANHATTAN
10028.0
40.777890
-73.955890
(40.77789, -73.95589)
NaN
NaN
160 EAST 84 STREET
0
0
0
0
0
0
0
0
Driver Inattention/Distraction
Unspecified
NaN
NaN
NaN
4692524
Sedan
Sedan
NaN
NaN
NaN
POINT (-73.95589 40.77789)
14
32367
12/31/2023
1900-01-01 21:34:00
MANHATTAN
10033.0
40.849308
-73.931920
(40.849308, -73.93192)
WEST 182 STREET
AUDUBON AVENUE
NaN
0
0
0
0
0
0
0
0
Unspecified
Unspecified
NaN
NaN
NaN
4692192
Station Wagon/Sport Utility Vehicle
Sedan
NaN
NaN
NaN
POINT (-73.93192 40.84931)
21
32368
12/31/2023
1900-01-01 00:38:00
MANHATTAN
10006.0
40.709496
-74.013900
(40.709496, -74.0139)
ALBANY STREET
WASHINGTON STREET
NaN
0
0
0
0
0
0
0
0
Other Vehicular
Unspecified
NaN
NaN
NaN
4692585
Sedan
Pick-up Truck
NaN
NaN
NaN
POINT (-74.01390 40.70950)
0
31042 rows × 31 columns
from colorcet import fireimport hvplot.pandasimport holoviews as hvimport geoviews as gv
# Datashaded density map of all Manhattan crashes
plot1 = dat2.hvplot.points(
    geo=True,                  # Enables geographic plotting
    x='LONGITUDE',             # Longitude for x-axis
    y='LATITUDE',              # Latitude for y-axis
    frame_width=800,           # Set frame width
    frame_height=600,          # Set frame height
    cmap=fire,                 # Use the Fire colormap
    datashade=True,            # Enable datashading for large datasets
    crs=4326,
    title='Manhattan Crashes'  # Set the plot title
)

# Dark background tiles
bg = gv.tile_sources.CartoDark

# Overlay the points on the basemap
bg * plot1
# Same density map, but with an interactive slider over the hour of day
plot2 = dat2.hvplot.points(
    geo=True,                  # Enables geographic plotting
    x='LONGITUDE',             # Longitude for x-axis
    y='LATITUDE',              # Latitude for y-axis
    frame_width=800,           # Set frame width
    frame_height=600,          # Set frame height
    cmap=fire,                 # Use the Fire colormap
    datashade=True,            # Enable datashading for large datasets
    crs=4326,
    groupby="CRASH HOUR",      # One frame per hour of the day
    title='Manhattan Crashes'  # Set the plot title
)

# Dark background tiles
bg = gv.tile_sources.CartoDark

# Overlay the points on the basemap
bg * plot2
Part 3: Density Map of Crashes due to passing/following too closely
import altair as alt

# Tally how often each factor appears as the primary contributing factor.
factor_counts = (
    manhattan_crashes["CONTRIBUTING FACTOR VEHICLE 1"]
    .value_counts()
    .reset_index()
)
factor_counts.columns = ["Contributing Factor", "Frequency"]

# value_counts() sorts descending, so head(10) yields the ten most common factors.
top_10_factors = factor_counts.head(10)

# Horizontal bar chart: one bar per factor, ordered by frequency.
chart = alt.Chart(top_10_factors).mark_bar().encode(
    x=alt.X("Frequency:Q", title="Frequency"),
    y=alt.Y("Contributing Factor:N", sort='-x', title="Contributing Factor"),
    # Color-code bars by factor; the y-axis already names them, so no legend.
    color=alt.Color("Contributing Factor:N", legend=None),
    tooltip=[
        alt.Tooltip("Contributing Factor:N", title="Factor"),
        alt.Tooltip("Frequency:Q", title="Count"),
    ],
).properties(
    title="Top 10 Contributing Factors for Vehicle Crashes in Manhattan (2021-2023)",
    width=600,
    height=400,
)

# Display the chart
chart
# Keep only crashes attributed to tailgating-style driving; the rest of
# the analysis focuses on these two contributing factors.
# `.isin` replaces the duplicated `==`/`|` comparison, and `.copy()` makes
# the result an independent frame so later column assignments do not raise
# SettingWithCopyWarning (as they did downstream of this cell).
_TAILGATING_FACTORS = ["Passing Too Closely", "Following Too Closely"]
manhattan_crashes_filtered = manhattan_crashes[
    manhattan_crashes["CONTRIBUTING FACTOR VEHICLE 1"].isin(_TAILGATING_FACTORS)
].copy()
manhattan_crashes_filtered
CRASH DATE
CRASH TIME
BOROUGH
ZIP CODE
LATITUDE
LONGITUDE
LOCATION
ON STREET NAME
CROSS STREET NAME
OFF STREET NAME
NUMBER OF PERSONS INJURED
NUMBER OF PERSONS KILLED
NUMBER OF PEDESTRIANS INJURED
NUMBER OF PEDESTRIANS KILLED
NUMBER OF CYCLIST INJURED
NUMBER OF CYCLIST KILLED
NUMBER OF MOTORIST INJURED
NUMBER OF MOTORIST KILLED
CONTRIBUTING FACTOR VEHICLE 1
CONTRIBUTING FACTOR VEHICLE 2
CONTRIBUTING FACTOR VEHICLE 3
CONTRIBUTING FACTOR VEHICLE 4
CONTRIBUTING FACTOR VEHICLE 5
COLLISION_ID
VEHICLE TYPE CODE 1
VEHICLE TYPE CODE 2
VEHICLE TYPE CODE 3
VEHICLE TYPE CODE 4
VEHICLE TYPE CODE 5
geometry
5
05/01/2021
3:01
MANHATTAN
10032.0
40.832886
-73.944020
(40.832886, -73.94402)
NaN
NaN
555 WEST 156 STREET
0
0
0
0
0
0
0
0
Passing Too Closely
Unspecified
NaN
NaN
NaN
4413557
Taxi
Station Wagon/Sport Utility Vehicle
NaN
NaN
NaN
POINT (-73.94402 40.83289)
20
05/01/2021
13:54
MANHATTAN
10036.0
40.761300
-73.999435
(40.7613, -73.999435)
NaN
NaN
635 WEST 42 STREET
0
0
0
0
0
0
0
0
Passing Too Closely
Unspecified
NaN
NaN
NaN
4413013
Station Wagon/Sport Utility Vehicle
NaN
NaN
NaN
NaN
POINT (-73.99944 40.76130)
22
05/01/2021
17:55
MANHATTAN
10029.0
40.799984
-73.944855
(40.799984, -73.944855)
EAST 116 STREET
MADISON AVENUE
NaN
0
0
0
0
0
0
0
0
Passing Too Closely
Unspecified
NaN
NaN
NaN
4412865
Sedan
NaN
NaN
NaN
NaN
POINT (-73.94486 40.79998)
34
05/01/2021
9:45
MANHATTAN
10035.0
40.802753
-73.933580
(40.802753, -73.93358)
EAST 125 STREET
2 AVENUE
NaN
0
0
0
0
0
0
0
0
Following Too Closely
Unspecified
NaN
NaN
NaN
4412859
Sedan
Box Truck
NaN
NaN
NaN
POINT (-73.93358 40.80275)
46
05/02/2021
12:15
MANHATTAN
10037.0
40.810024
-73.937540
(40.810024, -73.93754)
NaN
NaN
2096 MADISON AVENUE
0
0
0
0
0
0
0
0
Following Too Closely
Unspecified
NaN
NaN
NaN
4412870
Sedan
NaN
NaN
NaN
NaN
POINT (-73.93754 40.81002)
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
32333
12/30/2023
3:04
MANHATTAN
10029.0
40.790817
-73.942880
(40.790817, -73.94288)
NaN
NaN
231 EAST 106 STREET
0
0
0
0
0
0
0
0
Passing Too Closely
Unspecified
NaN
NaN
NaN
4691754
Station Wagon/Sport Utility Vehicle
Station Wagon/Sport Utility Vehicle
NaN
NaN
NaN
POINT (-73.94288 40.79082)
32340
12/30/2023
17:40
MANHATTAN
10001.0
40.747234
-73.993370
(40.747234, -73.99337)
WEST 28 STREET
7 AVENUE
NaN
1
0
0
0
0
0
1
0
Following Too Closely
Unspecified
NaN
NaN
NaN
4692517
Taxi
Box Truck
NaN
NaN
NaN
POINT (-73.99337 40.74723)
32349
12/31/2023
22:40
MANHATTAN
10019.0
40.767130
-73.993730
(40.76713, -73.99373)
11 AVENUE
WEST 52 STREET
NaN
0
0
0
0
0
0
0
0
Following Too Closely
Turning Improperly
NaN
NaN
NaN
4693643
Station Wagon/Sport Utility Vehicle
Bus
NaN
NaN
NaN
POINT (-73.99373 40.76713)
32351
12/31/2023
16:24
MANHATTAN
10027.0
40.809310
-73.949120
(40.80931, -73.94912)
NaN
NaN
215 WEST 125 STREET
0
0
0
0
0
0
0
0
Passing Too Closely
Unspecified
NaN
NaN
NaN
4693991
Sedan
Sedan
NaN
NaN
NaN
POINT (-73.94912 40.80931)
32359
12/31/2023
21:16
MANHATTAN
10011.0
40.738250
-74.001080
(40.73825, -74.00108)
NaN
NaN
237 WEST 13 STREET
0
0
0
0
0
0
0
0
Passing Too Closely
Unspecified
NaN
NaN
NaN
4691995
Station Wagon/Sport Utility Vehicle
NaN
NaN
NaN
NaN
POINT (-74.00108 40.73825)
2789 rows × 30 columns
Add a crash-hour column extracted from the crash time
# Work on an explicit copy so the column assignments below do not trigger
# pandas' SettingWithCopyWarning (the filtered frame is a slice of the
# source dataframe — exactly the warning shown in this cell's output).
manhattan_crashes_filtered = manhattan_crashes_filtered.copy()

# Parse 'CRASH TIME' strings (e.g. "13:54") into datetimes so the hour can
# be extracted. The resulting dates are the format's dummy 1900-01-01;
# only the time-of-day component is used.
manhattan_crashes_filtered['CRASH TIME'] = pd.to_datetime(
    manhattan_crashes_filtered['CRASH TIME'], format='%H:%M'
)

# Derive the hour of day (0-23) for hourly aggregation and plotting.
manhattan_crashes_filtered['CRASH HOUR'] = manhattan_crashes_filtered['CRASH TIME'].dt.hour
manhattan_crashes_filtered
C:\Users\txx11\mambaforge\envs\musa-550-fall-2023\lib\site-packages\geopandas\geodataframe.py:1538: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
super().__setitem__(key, value)
C:\Users\txx11\mambaforge\envs\musa-550-fall-2023\lib\site-packages\geopandas\geodataframe.py:1538: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
super().__setitem__(key, value)
# Drop rows without usable coordinates before mapping.
manhattan_crashes_filtered = manhattan_crashes_filtered.dropna(subset=['LONGITUDE', 'LATITUDE'])

# Dark base map centered on Midtown Manhattan.
m = folium.Map(location=[40.7580, -73.9851], zoom_start=12, tiles='CartoDB dark_matter')

# Cluster the crash points client-side; FastMarkerCluster handles
# thousands of markers without freezing the browser.
FastMarkerCluster(
    data=manhattan_crashes_filtered[['LATITUDE', 'LONGITUDE']].values.tolist()
).add_to(m)

m
Make this Notebook Trusted to load map: File -> Trust Notebook
Part 2: Congestion Prediction
Introduction
The purpose of this project is to design a predictive model for traffic congestion using a set of environmental and contextual features, including temperature, precipitation, wind speed, the occurrence of events, and whether it is a weekend. Traffic congestion is a significant issue in urban areas, impacting commute times, air quality, and overall productivity. By leveraging these variables, the model aims to understand the factors influencing traffic patterns and provide accurate predictions of traffic counts. Such a model could be instrumental in improving traffic management systems, informing infrastructure planning, and helping commuters make more informed decisions. The project seeks to demonstrate the feasibility of using readily available data to address real-world urban challenges.
A traffic prediction model has significant potential applications in optimizing traffic light systems to improve traffic flow and reduce congestion. By accurately predicting traffic counts based on environmental factors, events, and time-related variables, the model could serve as a critical input for adaptive traffic light control systems. For instance, the model could help dynamically adjust traffic light timings based on anticipated traffic volumes at specific intersections. During periods of high predicted traffic, longer green light durations could be allocated to heavily congested routes, while during low-traffic periods, shorter cycles could minimize unnecessary delays. This would ensure a more efficient allocation of green time, reducing wait times, fuel consumption, and emissions caused by idling vehicles.
Additionally, the model could be integrated into intelligent traffic management systems that coordinate traffic lights across multiple intersections. By predicting traffic patterns in advance, the system could optimize signal synchronization to create “green waves,” allowing vehicles to travel through a series of intersections without stopping. This approach could be particularly useful in urban areas with high traffic density, where poor signal coordination often exacerbates congestion. Furthermore, during special events or adverse weather conditions, the model could help traffic authorities proactively adjust signal timings to handle anticipated surges in traffic, minimizing disruptions. Overall, the integration of traffic prediction models into traffic light optimization systems has the potential to enhance urban mobility, reduce congestion, and improve the overall efficiency of transportation networks.
Importing The NYC Weather Data
Time: 05-01-2021 to 05-10-2021
Data Source:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the Data
weather_data = pd.read_csv("weather_Panel_NY.csv")
weather_data['interval60'] = pd.to_datetime(weather_data['interval60'])

# Step 2: Data Cleaning and Processing
# Zero temperatures are treated as invalid readings and imputed with 42.
# NOTE(review): 42 looks like a hand-picked fill value — confirm against the raw feed.
weather_data['Temperature'] = weather_data['Temperature'].apply(lambda x: 42 if x == 0 else x)


# Step 3: Define Plot Themes
def plot_theme(ax):
    """Apply a consistent theme to plots."""
    ax.set_title(ax.get_title(), fontsize=14, fontweight='bold')
    ax.set_xlabel(ax.get_xlabel(), fontsize=12)
    ax.set_ylabel(ax.get_ylabel(), fontsize=12)
    ax.tick_params(axis='x', labelsize=10, rotation=45)
    ax.tick_params(axis='y', labelsize=10)
    ax.grid(color="#eff3ff", linestyle='-', linewidth=0.5)
    ax.set_facecolor("white")
    for side in ('top', 'right', 'left', 'bottom'):
        ax.spines[side].set_visible(False)


# Step 4: Create Subplots — one panel per weather variable.
fig, axs = plt.subplots(3, 1, figsize=(12, 14), constrained_layout=True)

panels = [
    ('Precipitation', "Precipitation Over Time", "Precipitation (inches)", '#1f77b4'),
    ('Wind_Speed', "Wind Speed Over Time", "Wind Speed (mph)", '#ff7f0e'),
    ('Temperature', "Temperature Over Time", "Temperature (°F)", '#2ca02c'),
]
for ax, (column, plot_title, y_label, line_color) in zip(axs, panels):
    sns.lineplot(data=weather_data, x='interval60', y=column, ax=ax, color=line_color)
    ax.set_title(plot_title)
    ax.set_xlabel("Date and Time")
    ax.set_ylabel(y_label)
    plot_theme(ax)

# Add a main title for the entire figure.
fig.suptitle("Weather Data - New York City (May 2021)", fontsize=16, fontweight='bold')

# Step 5: Show and Save the Plot
plt.show()
fig.savefig("weather_data_plot.png", dpi=300)
Exploratory Data Analysis:
Traffic Count Across the Time:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the Data
data = pd.read_csv("data_filtered.csv")

# Step 2: Preprocess the Data
# NOTE(review): assumes the hourly traffic counts start at the 8th column
# (the first 7 hold segment metadata) — confirm with the CSV header.
traffic_columns = data.columns[7:]
data['Total_Traffic'] = data[traffic_columns].sum(axis=1)

# Sum all segments per calendar day, then order chronologically.
traffic_by_date = data.groupby("Date")['Total_Traffic'].sum().reset_index()
traffic_by_date['Date'] = pd.to_datetime(traffic_by_date['Date'])
traffic_by_date = traffic_by_date.sort_values(by='Date')

# Step 3: Plot the Data
sns.set(style="whitegrid")
plt.figure(figsize=(12, 6))
plt.plot(traffic_by_date['Date'], traffic_by_date['Total_Traffic'], color='black', linewidth=1)
plt.title("Number of Trips Over Time", fontsize=14, fontweight='bold')
plt.xlabel("Date", fontsize=12)
plt.ylabel("Number of Trips", fontsize=12)
plt.xticks(rotation=45)  # rotate dates for readability
plt.grid(visible=True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
Traffic Count Comparing Weekends and Weekdays:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Step 1: Load the Data
data = pd.read_csv("data_filtered.csv")

# Step 2: Preprocess the Data
# NOTE(review): assumes the hourly traffic counts start at the 8th column.
traffic_columns = data.columns[7:]
data['Total_Traffic'] = data[traffic_columns].sum(axis=1)
data['Date'] = pd.to_datetime(data['Date'])
data['Day_of_Week'] = data['Date'].dt.dayofweek  # Monday=0 ... Sunday=6
# Saturday (5) and Sunday (6) count as the weekend.
data['Weekend'] = data['Day_of_Week'].apply(lambda x: 'Weekend' if x >= 5 else 'Weekday')

# Reshape wide -> long: one row per (date, weekend flag, hour label, count).
hourly_data = pd.melt(
    data,
    id_vars=['Date', 'Weekend'],
    value_vars=traffic_columns,
    var_name='Hour',
    value_name='Traffic_Count',
)
# Hour labels look like "X12.00.1.00.AM" — pull out the leading number.
hourly_data['Hour'] = hourly_data['Hour'].str.extract(r'X(\d+)\.').astype(int)

# Step 3: Aggregate the Data — total counts per (hour, weekday/weekend).
hourly_traffic = hourly_data.groupby(['Hour', 'Weekend'])['Traffic_Count'].sum().reset_index()

# Step 4: Plot the Data
sns.set(style="whitegrid")
plt.figure(figsize=(12, 6))
sns.lineplot(
    data=hourly_traffic,
    x='Hour',
    y='Traffic_Count',
    hue='Weekend',
    palette={'Weekday': 'red', 'Weekend': 'blue'},
)
plt.title("Traffic Counts by Hour: Weekday vs Weekend", fontsize=14, fontweight='bold')
plt.xlabel("Hour", fontsize=12)
plt.ylabel("Traffic Counts", fontsize=12)
# Override the legend so the labels spell out the color mapping explicitly.
plt.legend(title="Traffic Type", labels=["Weekday (Red)", "Weekend (Blue)"], loc="upper right")
plt.tight_layout()
plt.show()
Traffic Count Comparing Streets:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the Data
data = pd.read_csv("data_filtered.csv")

# Step 2: Preprocess the Data
# NOTE(review): assumes the hourly traffic counts start at the 8th column.
traffic_columns = data.columns[7:]
data['Total_Traffic'] = data[traffic_columns].sum(axis=1)

# Step 3: Aggregate Traffic by Roadway Name, busiest roadways first.
traffic_by_roadway = (
    data.groupby('Roadway.Name')['Total_Traffic']
    .sum()
    .reset_index()
    .sort_values(by='Total_Traffic', ascending=False)
)

# Step 4: Plot the Data
sns.set(style="whitegrid")
plt.figure(figsize=(12, 6))
sns.barplot(data=traffic_by_roadway, x='Total_Traffic', y='Roadway.Name', palette="viridis")
plt.title("Traffic Count by Roadway Name", fontsize=14, fontweight='bold')
plt.xlabel("Total Traffic Count", fontsize=12)
plt.ylabel("Roadway Name", fontsize=12)
plt.tight_layout()
plt.show()
Data Processing:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 1: Load the Datasets
traffic_data = pd.read_csv("data_filtered.csv")
weather_data = pd.read_csv("weather_Panel_NY.csv")

# Step 2: Preprocess the Traffic Data
traffic_data['Date'] = pd.to_datetime(traffic_data['Date'])

# Hourly count columns are the ones whose names contain 'X'
# (e.g. "X12.00.1.00.AM").
hourly_columns = [col for col in traffic_data.columns if 'X' in col]

# Wide -> long: one row per (segment, date, hour label).
traffic_hourly = traffic_data.melt(
    id_vars=['Date', 'SegmentID', 'Roadway.Name', 'From', 'To', 'Direction'],
    value_vars=hourly_columns,
    var_name='Hour',
    value_name='Traffic_Count',
)

# Pull the leading number out of the label and shift by one to get an hour offset.
# NOTE(review): verify this maps the "12 AM" style labels onto the intended
# 0-23 hour — the extraction only keeps the first number.
traffic_hourly['Hour'] = traffic_hourly['Hour'].str.extract(r'X(\d+)').astype(int) - 1

# Combine date and hour offset into one timestamp for joining against weather data.
traffic_hourly['Timestamp'] = traffic_hourly['Date'] + pd.to_timedelta(traffic_hourly['Hour'], unit='h')

# Only the timestamp and the count are needed downstream.
traffic_hourly = traffic_hourly[['Timestamp', 'Traffic_Count']]
Add the ‘is_weekend’ Feature
# Step 3: Add the 'is_weekend' Feature
# dayofweek: Monday=0 ... Sunday=6, so >= 5 means Saturday or Sunday.
traffic_hourly['Day_of_Week'] = traffic_hourly['Timestamp'].dt.dayofweek
traffic_hourly['is_weekend'] = traffic_hourly['Day_of_Week'].apply(lambda x: 1 if x >= 5 else 0)

# Step 4: Preprocess the Weather Data
weather_data['interval60'] = pd.to_datetime(weather_data['interval60'])

# Step 5: Merge the Datasets on the hourly timestamp; the inner join keeps
# only hours present in both panels.
combined_data = pd.merge(
    traffic_hourly,
    weather_data,
    left_on='Timestamp',
    right_on='interval60',
    how='inner',
)

# Drop the duplicate join key and the intermediate day-of-week column.
combined_data = combined_data.drop(columns=['interval60', 'Day_of_Week'])
Add the Holiday/Event Dates Feature
# Step 1: Define the Holiday/Event Dates
holiday_event_dates = [
    "2021-05-01",  # International Workers' Day
    "2021-05-05",  # Cinco de Mayo
    "2021-05-09",  # Mother's Day
]

# Step 2: Add the 'is_holiday_or_event' Variable
# Convert once so the .date comparison below works against datetime.date values.
holiday_event_dates = pd.to_datetime(holiday_event_dates)

# Flag every hourly row whose calendar date falls on a listed holiday/event.
traffic_hourly['is_holiday_or_event'] = (
    traffic_hourly['Timestamp'].dt.date.isin(holiday_event_dates.date).astype(int)
)
Making the model
# Step 3: Merge with Weather Data
# Re-merge so combined_data picks up the newly added 'is_holiday_or_event' column.
combined_data = pd.merge(traffic_hourly, weather_data, left_on='Timestamp', right_on='interval60', how='inner')

# Step 4: Feature Selection — weather, weekend and holiday/event flags.
X = combined_data[['Temperature', 'Precipitation', 'Wind_Speed', 'is_weekend', 'is_holiday_or_event']]
y = combined_data['Traffic_Count']

# Step 5: Train-Test Split and Model Training (fixed seeds for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Step 6: Evaluate the Model on the held-out split.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

from tabulate import tabulate  # Install this with `pip install tabulate`

# Step 7: Build a per-row comparison table of actual vs predicted values.
results_df = pd.DataFrame({
    'Actual Value': y_test.values,
    'Predicted Value': y_pred,
})
results_df['Absolute Error'] = abs(results_df['Actual Value'] - results_df['Predicted Value'])
results_df['Squared Error'] = (results_df['Actual Value'] - results_df['Predicted Value']) ** 2

# MAE/MSE are global metrics; repeating them per row is purely for display.
results_df['MAE'] = mae
results_df['MSE'] = mse

# Round everything to two decimals for readability.
results_df = results_df.round({
    'Actual Value': 2,
    'Predicted Value': 2,
    'Absolute Error': 2,
    'Squared Error': 2,
    'MAE': 2,
    'MSE': 2,
})

# Pretty-print the first ten rows, then the overall metrics.
table = tabulate(results_df.head(10), headers='keys', tablefmt='pretty')
print(table)
print(f"Model Evaluation Metrics:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")
The model’s performance, as reflected by the evaluation metrics, indicates significant limitations. The Mean Absolute Error (MAE) of 186.21 suggests that, on average, predictions deviate from actual values by 186 traffic counts, which may be substantial depending on the scale of traffic. The Mean Squared Error (MSE) of 67,145 highlights large errors, particularly influenced by extreme outliers, given the quadratic nature of MSE. Most concerning is the R-squared (R²) value of 0.00065, which indicates that the model explains less than 0.1% of the variance in the data, essentially performing no better than a simple mean-based prediction. This suggests the model fails to capture meaningful patterns in the data, likely due to insufficient features, inadequate complexity, or a mismatch between the data and the model’s assumptions.
Despite the weak overall fit reported above, the residual plot shows residuals roughly centered around zero with no clear non-linear pattern, suggesting the model is at least unbiased on average rather than systematically mis-specified. However, the increasing spread of residuals with larger predicted values highlights heteroscedasticity, meaning the model’s errors grow with higher traffic counts, reducing reliability for these predictions. Additionally, the presence of significant outliers (residuals exceeding 1000) suggests that the model struggles to account for unusual traffic conditions, potentially due to missing features such as accidents or events. A slight tendency to underpredict higher traffic counts is also observed, as evidenced by the clustering of residuals below zero.
# Scatter of actual vs predicted counts; the dashed y=x line marks a
# perfect prediction, so distance from it is the error.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6)
lo, hi = y_test.min(), y_test.max()
plt.plot([lo, hi], [lo, hi], color='r', linestyle='--')
plt.title("Actual vs Predicted Traffic Counts")
plt.xlabel("Actual Traffic Count")
plt.ylabel("Predicted Traffic Count")
plt.show()
The scatter plot compares actual traffic counts (x-axis) to predicted traffic counts (y-axis) for a Random Forest model, with the red dashed line representing the ideal scenario where predictions perfectly match actual values. The plot reveals several key insights regarding the model’s performance and limitations.
First, the majority of predicted values cluster within a narrow range (approximately 200–400), regardless of the actual traffic counts. This indicates that the model struggles to capture variability in traffic counts, particularly for higher values, leading to underprediction for actual traffic counts above 500. The lack of points along the diagonal line for larger actual counts highlights this systematic error. Additionally, the spread of points becomes more pronounced as actual traffic counts increase, reflecting heteroscedasticity, where prediction errors grow with larger traffic counts. This suggests that the model’s performance deteriorates for higher traffic levels, potentially due to insufficient features to explain variability at these extremes.
The model performs reasonably well for lower traffic counts, as many predictions fall close to the diagonal line in this range. However, the consistent over-concentration of predicted values between 200 and 400 suggests that the model may be biased toward predicting average traffic counts, a limitation likely stemming from the training process or insufficient feature diversity.
# Residuals (actual - predicted) on the held-out split.
# NOTE(review): `residuals` was referenced but never defined anywhere in
# this notebook, so this cell raised a NameError as written — compute it
# here from the test-set predictions.
residuals = y_test.values - y_pred

# Plot distribution of residuals
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, bins=30, color='blue')
plt.title("Distribution of Residuals")
plt.xlabel("Residuals (Actual - Predicted)")
plt.ylabel("Frequency")
plt.show()
The residual distribution is right-skewed, with most residuals concentrated between -250 and 250, indicating that the Random Forest model performs reasonably well for the majority of predictions. However, the long positive tail reveals significant underprediction for higher traffic counts, suggesting the model struggles with variability in extreme conditions. This aligns with earlier observations of heteroscedasticity, where prediction errors increase with larger traffic counts. Additionally, extreme residuals exceeding 1000 indicate outliers or unaccounted factors, such as events or anomalies. To improve, consider applying a log transformation to stabilize variance, adding relevant features (e.g., time, weather), and exploring advanced models like Gradient Boosting to better capture complex patterns.
Limitations
Despite its intent, the model faces several limitations that hinder its performance. The low R-squared value indicates that the features used explain only a negligible portion of the variance in traffic counts, suggesting that other critical factors influencing traffic, such as time of day, road capacity, historical traffic trends, or localized disruptions, are missing from the model. Additionally, the model exhibits heteroscedasticity, with residual errors increasing for higher traffic counts, indicating that it struggles to capture variability in extreme conditions. This issue is further compounded by outliers, such as unusual traffic spikes during events or accidents, which the model fails to predict accurately. Moreover, the features included may not fully capture the non-linear and complex relationships between environmental conditions and traffic patterns, limiting the model’s ability to generalize to diverse scenarios. These limitations highlight the need for more comprehensive data and advanced modeling techniques to improve predictive accuracy.
Conclusion
In conclusion, while the project demonstrates the potential of using environmental and contextual features to predict traffic congestion, the results indicate that the current model is insufficient for accurate and reliable predictions. The significant errors and low explanatory power suggest that the complexity of traffic patterns requires a more robust approach. Future efforts should focus on incorporating additional features, such as time of day, historical traffic data, and real-time factors like road closures or accidents. Advanced modeling techniques, such as Gradient Boosting Machines or Neural Networks, could also be explored to better capture non-linear relationships and interactions between variables. Despite its limitations, this project serves as a valuable starting point for understanding the factors influencing traffic congestion and highlights the importance of data-driven approaches in addressing urban mobility challenges.